; Tetraskelion
; a 256-byte intro by rrrola <rrrola@gmail.com>
; greets to everyone who likes glowsticks

; Render ten layers of broken shiny circles.
; Uses a low-res mode with 16-bit colors.
; To make it faster, compute only half of the screen.

org 100h ; assume cs<=0x2a18 [0xfd..ff]=0 bx=0 sp=-2 si=0x100

;Timer multipliers.
T1 equ 27*4  ; rotation
T2 equ 19*4  ; scale
T3 equ -32*4 ; circle radius
T4 equ 23*4  ; color cycling

;Prepare segments:
; ss constants|stack|scratch
; ds cosine table
; fs color_mul/cosine table
; gs mirror buffer
; es screen
%define w(xx) word[byte bp+si-0x100+xx]
%define d(xx) dword[byte bp+si-0x100+xx]

  push 0xa000   ;=68 00 a0  scratch (word|dword), scale (qword)
                ; pop later to make sp=-4 in the loop below
  lds bp,[si-3] ; bp=0, ds=0x6800  table: cos
  mov ax,0x4f02
  mov fs,ax           ; fs=0x4f02  table: color_mul / cos()

;  fninit

;Precompute tables with 16384 entries: cosine, color_mul/cosine
COS_TAB:
  fild word[bp+di]  ; bx/4=[bp+di]=[-2]=angle (0 on init)
  fmul d(TABLE_STEP)
  fcos           ;; cos(angle/65536*2pi): adjust period to 2pi
  fst dword[bx]

  fldpi           ; color_mul = 3.14
  fdivrp st1,st0  ; color_mul / cos(...)
;  fidivr [bp+si+?] ; -1 byte, but I like color_mul = pi

  fstp dword[fs:bx]
  sub bx,sp       ; bx+=4
  inc word[bp+di] ; next angle
  jnz COS_TAB     ; bx=0 [-2]=0

  mov bx,0x10e
  int 10h    ; set mode 320x200 with 65536 colors; assume it's ok (ax=0x004f)
  pop es              ; es=0xa000  screen

  add ax,0x39c9   ; constant: 07 05 c9 39, should be "db 0f c9 39"
TABLE_STEP equ $-4 ;= 0.000383495197 = 2pi / 16384, also ~ 1 / (256 * pi^2)
;  mov gs,ax           ; gs=0x3a18  mirror buffer

; Frame loop: cx = time
M:

; Precompute the per-iteration scale.
  imul bx,cx,T2
  fld d(ZOOM)
  fsub dword[bx]        ;; zoom-cos(t2)
  fdiv d(ZOOM)          ;; (zoom-cos(t2))/zoom
  fstp qword[bp+si]     ;; scale = 1 - cos(t2)/zoom
  inc cx          ; time++
ZOOM equ $-4  ; =9.679

;Pixel loop: di = pixel index = pixel address / 2
X mov ax,0xcccd ; convert width 320 -> 65536
  mul di
  xchg ax,bx    ; full 16-bit precision of X
  mov ax,0x4f05
;  add bx,ax
;  adc dx,0x9b80 ; put center at [100, 159.5]: should add 0x9b804d46
  add dx,0x9b80 ; put center at [100, 159.5]: should add 0x9b804d46

  pusha ; [-18-16-14-12-10 -8 -6 -4] on the stack
        ;   di si bp sp bx dx cx ax
        ;                  yy tt
        ;                x x

;Is it time to set the VESA bank?
  add di,di     ; di=pixel address
  jnz D
  cwd
  adc dx,dx     ; dx=page (0 or 1)
  xor bx,bx     ; bh=0 (set bank), bl=0 (window id)
  int 10h       ; assume 64kB granularity and window at 0xA000

;Compute a new color or load it from the mirror buffer.
D: ;mov ax,[gs:di] ; load from mirror buffer
  ;jc COPY_MIRROR ; compute only the top half of the screen

  push di
  call IT        ; compute a new color
  pop di

;Set pixel color
;  sub bp,di      ; mirrored store to mirror buffer
;  mov [gs:bp+320*201*2-65536 - 2],ax
COPY_MIRROR:
  stosw
;  stosw   ; 2x faster, also adjust mirroring to -4
  popa
;  inc di  ; 2x faster
  inc di
  jnz X  ; di=0

;  call SCREENSHOT

;ESC check.
  in al,60h
  cmp al,1
  jne M     ; fallthrough, exit later

IT:

;Color accumulators = 0, load coordinates and radius.
Z fldz
  inc bp
  jpo Z   ; loop 3x, bp=3  ;; R=0 G=0 B=0

  dec bp
L dec bp            ; load y=[-9], x=[-8]
  fild word[bp-9]
  fadd st0          ;; x[-65536..65536] y R G B
  jpo L  ; loop 2x, bp=0 again, zero flag = 1

  jmp LEN           ; call LEN subroutine
LEN_RET:
  imul di,[bp+si],4 ; di = d = 65536/2pi * length(x,y)/2

  imul dx,cx,T4
  sub dx,di         ; dx = t4-d
  imul cx,T3
  add di,cx         ; di = d-t3

  mov ax,0x8000 + 10 ; al = number of iterations, ah=0x80 = fold offset

;Rotate.
; [x] = [C -S] * [x]
; [y]   [S  C]   [y]
I mov cl,0x25    ; RGB phase shift, later shift length

  ;cl: horrible parity hack for looping
  ; 25 00100101 o <- start
  ; 26 00100110 o        <- after Q
  ; 27 00100111 e   <- after R
  ; 28 00101000 e
  ; 29 00101001 o     <- after F

  imul bx,[bp-6],T1  ; bx = t1 (cx is used for looping)
R fld st1        ;; y x y R G B   | x Sy x Cy R G B
  fmul dword[bx] ;; Cy x y R G B  | Cx Sy x Cy R G B
  fxch st2       ;; y x Cy R G B  | x Sy Cx Cy R G B
  fmul dword[bx-0x4000]
  inc cx         ;; Sy x Cy R G B | Sx Sy Cx Cy R G B
  jpo R ; loop 2x: cl=0x27
  faddp st3,st0  ;; Sy Cx Sx+Cy R G B
  fsubp st1,st0  ;; x=Cx-Sy y=Sx+Cy R G B

;Scale, square fold.
F fmul qword[bp+si]  ; scale
  fistp dword[bp+si] ; wrap: keep only bottom 16 bits
  add [bp+si],ax     ;~0x8000, can also be xor (cf=0)
  fild word[bp+si] ;; x = x-round(x) | y = y-round(y)
  fxch st1
  inc cx
  jpe F ; loop 2x: cl=0x29, zero flag = 0

;Subroutine: compute length of 2D vector, scale to access cos table
LEN: ;; x y -> [bp+si] = sqrt(x*x+y*y)/65536/2 * 16384/2pi ~ sqrt(C * (x*x+y*y))
  fld st1
  fmul st0
  fld st1   ; -1 byte: cmc, jc LEN (but it's slow in DOSBox)
  fmul st0
  faddp
  fmul d(TABLE_STEP)  ; exact: (16384/2pi/65536/2)^2 = 0.000395785+
  fsqrt
  fistp word[bp+si]
  jz LEN_RET  ; return up if zero flag = 1

;Add colorful circles.

; k = color_mul / cos(5*length(x,y) + d - t3)
; [R G B] += k * ( 1 + 2 * cos(3*(i/40 + t4-d) + 0.9*[2 1 0]) );
  dec dh
  imul bx,dx,3      ; bx = q = 65536/2pi * 3*(i/40 + t4-d)
Q:add bh,0x25         ; q += ~ 0.9 * 65536/2pi
  ;add bh,cl         ; q += ~ 1.0 * 65536/2pi  ; -1 byte, I like the 0x25 colors better
  fld1
  fsub dword[bx]
  fsub dword[bx]    ;; 1+2cos(q) x y R G B
  dec cx ; cl=0x26
  jpe Q  ; loop 3x ;; [dR dG dB]=1+2cos(q+0.9*[2 1 0])) x y R G B  ; bp=3

  imul bx,[bp+si],10*4 ; 65536/2pi * 5*length(x,y)
K fmul dword[fs:bx+di] ; k = color_mul / cos(5*length(x,y) + d - t3)
  faddp st5,st0        ;; dG gB x y R+=k*dR G B
  add al,0x55  ; +55 +aa +ff(=-1),carry
  jnc K ; loop 3x     ;; x y R+=k*dR G+=k*dG B+=k*dB

  jnz I ; al=0, cl=0x26

  fcompp            ;; R G B

; Assemble R,G,B into 16-bit high color (5+6+5 bits). Clamp to 0..0xffff.
  inc ax  ; ax=0x8001
A fmul st0          ;; R^2 G^2 B^2
  fistp word[bp+si] ; if it's > 0x7fff, clamp to 0x8000
  imul bx,[bp+si],2 ; double, set carry if it was > 0x3fff
  sbb bx,bp         ; overflow -> 0xffff
  xor cl,5^6        ; cl & 0x1f = shift length: flip 5<->6
  shld ax,bx,cl ; rrrrrggggggbbbbb
  jnc A ; loop 3x until you shift the 1 bit out of ax

  ret

;%include "screenshot320_16.inc"
